* Session setup: start from an empty session and fix the interpreter settings
* used by the rest of this build file.
* Drop any data, programs, matrices, etc. left over from earlier work
clear all
* Do not pause long output with --more-- prompts (needed for unattended runs)
set more off
* Memory request. NOTE(review): newer Stata releases manage memory automatically
* and may simply ignore this setting -- confirm whether it is still needed.
set mem 10000000
* Raise the maximum matrix dimension.
* NOTE(review): the allowed maximum depends on the Stata flavor -- confirm.
set matsize 10000
* Interpret all following commands with Stata 13 syntax and behavior
version 13

****************************************************************** 
*** Build File to Process Raw Census 2001 Village Directory ******
****************************************************************** 

** Set file paths
* Runs paths.do from the folder stored in the global macro path_code, which
* must already be defined when this file starts.
* NOTE(review): presumably paths.do defines the global vd01 that every step
* below uses as its data folder -- confirm.
do "$path_code/paths.do"

********************************************************************************
********************************************************************************

* Combine all csv into a single raw vd
* Every *.csv file in the All folder is read with all columns converted to
* string and stacked into vd_2001_all_raw.dta, which is rebuilt from scratch
* on each run. Files that cannot be read are reported and skipped.
{
clear
cd "$vd01/All"
* Remove the output of any earlier run (no error if it does not exist)
cap erase "$vd01/vd_2001_all_raw.dta"
local allfiles : dir . files "*.csv" 
foreach f of local allfiles {
  * The file name is quoted so that names containing spaces are read correctly
  cap insheet using "`f'", comma names double clear
  local rc = _rc
  if `rc' {
    * After a failed read the previously loaded data is still in memory;
    * skip the file so that data is not appended to itself and re-saved.
    di as error "could not read `f' (rc = `rc'); file skipped"
    continue
  }
  * Force every column to string so files with differing column types can be stacked
  foreach var of varlist * {
    cap tostring `var', replace force
  }
  * Add the rows accumulated so far below the file just read.
  * The accumulated file does not exist yet on the first successful pass.
  cap confirm file "$vd01/vd_2001_all_raw.dta"
  if !_rc {
    append using "$vd01/vd_2001_all_raw.dta", force
  }
  * Errors from append/save are no longer swallowed: a silent failure here
  * would overwrite or lose the rows accumulated from earlier files.
  save "$vd01/vd_2001_all_raw.dta", replace
  di "`f'"
}
}

* Clean raw vd
* Trims every column, removes duplicate and header rows, scrubs stray
* characters out of fields that should be numeric, converts those fields
* from string to numeric, and overwrites the raw file with the result.
{
use "$vd01/vd_2001_all_raw.dta", clear
drop v70

* Strip leading and trailing blanks from every column
foreach col of varlist * {
  replace `col' = trim(`col')
}
compress

* Remove rows that are exact copies of another row
duplicates tag, gen(dupes)
sort *
duplicates drop
* Drop every row that shares its st_code with the first row of the sorted data,
* then drop column-header rows that were read in as data
drop if st_code==st_code[1]
drop if inlist(st_code,"ST_CODE","ST_Code")
drop dupes

* Flag rows that repeat on the identifying columns (kept as variable dupes)
duplicates tag st_code-ss_code, gen(dupes)

* Letter O typed in place of digit zero in the village code
replace v_ct_code = subinstr(v_ct_code,"O","0",.)

* Income and expenditure: remove thousands separators, blank out lone dots
foreach money in tot_inc tot_exp {
  replace `money' = subinstr(`money',",","",.)
  replace `money' = "" if `money'=="."
}

* Land-use columns: remove the N.A marker, blank out lone dots
replace land_fores = subinstr(land_fores,"N.A","",.)
foreach col of varlist land_fores-jhum_cul {
  replace `col' = "" if `col'=="."
}

* Diagnostic only: list canal_pvt values that do not look like a number
gen byte notnumeric = !regexm(canal_pvt, "^[-+]?[0-9]*\.?[0-9]+$")
tab canal_pvt if notnumeric
drop notnumeric

* Convert to numeric; without the force option a column that still holds
* non-numeric text is left as string
destring st_code - v_ct_code area - app_nw dist_town - tot_exp land_fores-jhum_cul, replace
drop if st_code == .

compress
save "$vd01/vd_2001_all_raw.dta", replace
}

* Process raw vd
* Renames the identifying columns, assigns a row id, recodes the yes/no
* facility indicators to 0/1, normalizes text columns, and adds the _01
* suffix to the data columns.
{
use "$vd01/vd_2001_all_raw.dta", clear
* Keep one row per state/district/tehsil/block/village code/village name
duplicates drop st_code dist_code thsil_code block_code v_ct_code vill_name, force
rename dist_code dt_code 
rename thsil_code bk_code1_vd
rename block_code bk_code2_vd
rename v_ct_code vi_code
rename vill_name village_vd
rename t_p t_p_vd
* Full variable name is used here (not an abbreviation) so the line does not
* depend on the varabbrev setting or on village_vd being the only match
replace village_vd = upper(trim(itrim(village_vd)))
duplicates drop
* Row id for this dataset
gen vd01_id = _n
order vd01_id

* Recode indicator columns: 1 stays 1, 2 becomes 0, and 0, 3, 8 and negative
* values become missing. The assert stops the run on any other value.
* drnk* is written as a wildcard instead of a bare abbreviation so it resolves
* even when variable abbreviation is switched off.
foreach v of varlist edu_fac medi_fac drnk* p_t_fac tap-spring other comm_fac bs_fac rs_fac nw_fac crsoc_fac rc_fac bank_fac app_pr-app_nw power* pap_mag a_incexp {
	replace `v' = . if `v'==0 | `v'==3 | `v'==8 | `v'<0
	replace `v' = 0 if `v'==2
	di "`v'"
	assert `v'==0 | `v'==1 | `v'==.
}
* Upper-case the free-text columns and collapse extra blanks
foreach v of varlist sou_summ ss_code near_town man_comm* {
	replace `v' = upper(trim(itrim(`v')))
}

* Newspaper / magazine: any value containing N (resp. M) becomes 1,
* everything else becomes missing
replace news_pap = "1" if regexm(upper(news_pap),"N")
replace news_pap = "" if news_pap!="1"
replace magazine = "1" if regexm(upper(magazine),"M")
replace magazine = "" if magazine!="1"
destring news_pap magazine, replace force

* Add the _01 suffix to every column from area through area_na_cu
foreach v of varlist area-area_na_cu {
  rename `v' `v'_01
}
}


* Label variables
* Each label is applied under capture, so a variable that is absent from the
* data is skipped without stopping the run.
{
cap la var vd01_id			"2001 VD id"
cap la var st_code 			"2001 state code"
cap la var dt_code 			"2001 district code"
cap la var bk_code1_vd		"2001 block code (VD 1)"
cap la var bk_code2_vd		"2001 block code (VD 2)"
cap la var vi_code 			"2001 village code"
cap la var village_vd		"Village name (VD)"
cap la var area_01 			"Village area (hectares)"
cap la var edu_fac_01 		"Education facilities (Y/N)"
cap la var p_sch_01	 		"Number of primary schools"
cap la var rang_p_sch_01 	"Range to nearest primary school (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var m_sch_01	 		"Number of middle schools"
cap la var rang_m_sch_01 	"Range to nearest middle school (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var s_sch_01	 		"Number of secondary schools"
cap la var s_s_sch_01	 	"Number of senior secondary schools"
cap la var college_01	 	"Number of colleges"
cap la var rang_coll_01 	"Range to nearest college (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var ind_sch_01 		"Number of industrial schools"
cap la var tr_sch_01 		"Number of training schools"
cap la var adlt_lt_ct_01 	"Number of adult literacy centers"
cap la var oth_sch_01 		"Number of other educational facilities"
cap la var medi_fac_01 		"Medical facilities (Y/N)"
cap la var all_hosp_01		"Number of allopathic hospitals"
cap la var rang_all_01		"Range to nearest allopathic hospital (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var ayu_hosp_01		"Number of ayurvedic hospitals"
cap la var un_hosp_01		"Number of unani hospitals"
cap la var hom_hosp_01		"Number of homeopathic hospitals"
cap la var all_disp_01		"Number of allopathic dispensaries"
cap la var ayu_disp_01		"Number of ayurvedic dispensaries"
cap la var un_disp_01		"Number of unani dispensaries"
cap la var hom_disp_01		"Number of homeopathic dispensaries"
cap la var mcw_cntr_01		"Number of maternity and child welfare centers"
cap la var rang_mcw_01		"Range to nearest maternity & cw center (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var m_home_01		"Number of maternity homes"
cap la var cwc_01			"Number of child welfare centers"
cap la var h_cntr_01		"Number of health centers"
cap la var ph_cntr_01		"Number of primary health centers"
cap la var rang_phc_01		"Range to nearest primary health center (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var phs_cnt_01		"Number of primary health sub-centers"
cap la var fwc_cntr_01		"Number of family welfare centers"
cap la var tb_cln_01		"Number of T.B. clinics"
cap la var n_home_01		"Number of nursing homes"
cap la var rmp_01			"Number of registered private medical practitioners"
cap la var smp_01			"Number of subsidized medical practitioners"
cap la var chw_01			"Number of community health workers"
cap la var oth_cntr_01		"Number of other medical facilities"
cap la var drnk_wat_f_01 	"Drinking water facilities (Y/N)"
cap la var rang_wat_f_01 	"Range to nearest drinking water source (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var tap_01		 	"Tap water (Y/N)"
cap la var well_01		 	"Well water (Y/N)"
cap la var tank_01		 	"Tank water (Y/N)"
cap la var tubewell_01		"Tubewell water (Y/N)"
cap la var handpump_01		"Handpump water (Y/N)"
cap la var river_01			"River water (Y/N)"
cap la var canal_01			"Canal water (Y/N)"
cap la var lake_01			"Lake water (Y/N)"
cap la var spring_01		"Spring water (Y/N)"
cap la var other_01			"Other water source (Y/N)"
cap la var sou_summ_01		"Source of summer water"
cap la var rang_ss_01		"Range to nearest summer drinking water source (1 = <5km, 2 = 5-10km, 3 = >10km)" 
cap la var ss_code_01		"Nearest source code"
cap la var p_t_fac_01	 	"Post, telegraph, telephone facilities (Y/N)"
cap la var post_off_01	 	"Number of post offices"
* Fixed label: the word summer had been copied over from the summer-water label
cap la var rang_po_01	 	"Range to nearest post office (1 = <5km, 2 = 5-10km, 3 = >10km)" 
cap la var tele_off_01	 	"Number of telegraph offices"
cap la var post_tele_01	 	"Number of post and telegraph offices"
cap la var phone_01	 		"Number of telephone connections"
cap la var rang_phone_01	"Range to nearest telephone connection (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var comm_fac_01	 	"Communication facilities (Y/N)"
cap la var bs_fac_01	 	"Bus service (Y/N)"
cap la var rang_bs_01 		"Range to nearest bus service (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var rs_fac_01	 	"Rail service (Y/N)"
cap la var rang_rs_01 		"Range to nearest rail service (1 = <5km, 2 = 5-10km, 3 = >10km)" 
cap la var nw_fac_01	 	"Navigable waterway, incl. river, canal, etc. (Y/N)"
cap la var rang_nw_01 		"Range to nearest navigable waterway (1 = <5km, 2 = 5-10km, 3 = >10km)" 
cap la var bank_fac_01		"Banking facilities (Y/N)"
cap la var comm_bank_01		"Number of commercial banks"
cap la var rang_comm_01		"Range to nearest commercial bank (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var coop_bank_01		"Number of cooperative commercial banks"
* Fixed label: spelling of cooperative
cap la var rang_coop_01		"Range to nearest cooperative commercial bank (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var crsoc_fac_01		"Credit societies (Y/N)"
cap la var ac_soc_01		"Number of agricultural credit societies"
cap la var rang_acs_01		"Range to nearest agricultural credit society (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var nac_soc_01		"Number of non-agricultural credit societies"
cap la var rang_nac_01		"Range to nearest non-agricultural credit society (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var other_soc_01		"Number of other credit societies"
cap la var rang_oth_01		"Range to nearest other credit society (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var rc_fac_01		"Recreational and cultural facilities (Y/N)"
cap la var c_v_hall_01		"Number of cinema/video halls"
* Fixed label: now matches the cinema/video hall count it belongs to
cap la var rang_cv_01		"Range to nearest cinema/video hall (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var sp_cl_fac_01		"Number of sports clubs"
cap la var rang_spcl_01		"Range to nearest sports club (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var st_au_fac_01		"Number of stadiums/auditoriums"
cap la var rang_stau_01		"Range to nearest stadium/auditorium (1 = <5km, 2 = 5-10km, 3 = >10km)"
cap la var app_pr_01		"Approach on paved road (Y/N)"
cap la var app_mr_01		"Approach on mud road (Y/N)"
cap la var app_fp_01		"Approach on foot path (Y/N)"
cap la var app_navriv_01	"Approach on navigable river (Y/N)"
cap la var app_navcan_01	"Approach on navigable canal (Y/N)"
cap la var app_nw_01		"Approach on other navigable waterway (Y/N)"
cap la var near_town_01		"Nearest town"
cap la var dist_town_01		"Distance to nearest town (km)"
cap la var power_supl_01	"Electricity supply (Y/N)"
cap la var power_dom_01		"Electricity for domestic use (but not all uses) (Y/N)"
cap la var power_agr_01		"Electricity for agricultural use (but not all uses) (Y/N)"
cap la var power_oth_01		"Electricity for other purposes (but not all uses) (Y/N)"
cap la var power_all_01		"Electricity for all purposes (Y/N)"
cap la var pap_mag_01		"Newspaper or magazine (Y/N)"
cap la var news_pap_01		"Newspaper (Y/N)"
cap la var magazine_01		"Magazine (Y/N)"
cap la var a_incexp_01		"Separate income and expenditure figures? (Y/N)"
cap la var man_comm1_01		"Manufactured commodity 1"
cap la var man_comm2_01		"Manufactured commodity 2"
cap la var man_comm3_01		"Manufactured commodity 3"
cap la var tot_inc_01    	"Total income (100 Rupees)"
cap la var tot_exp_01    	"Total expenditure (100 Rupees)"
cap la var land_fores_01 	"Area of forest (hectares)"
cap la var canal_govt_01 	"Area irrigated by government canal (hectares)"
cap la var canal_pvt_01 	"Area irrigated by private canal (hectares)"
cap la var well_wo_el_01	"Area irrigated by well, w/o electricity (hectares)"
cap la var well_w_el_01		"Area irrigated by well, w/ electricity (hectares)"
cap la var tw_wo_el_01		"Area irrigated by tubewell, w/o electricity (hectares)"
cap la var tw_w_el_01		"Area irrigated by tubewell, w/ electricity (hectares)"
cap la var tank_irr_01		"Area irrigated by tank (hectares)"
cap la var river_irr_01		"Area irrigated by river (hectares)"
cap la var lake_irr_01		"Area irrigated by lake (hectares)"
cap la var w_fall_01		"Area irrigated by waterfall (hectares)"
cap la var oth_irr_01		"Area irrigated by other source (hectares)"
cap la var tot_irr_01		"Area irrigated (hectares)"
cap la var un_irr_01		"Area unirrigated (hectares)"
* Fixed label: the land-use category is culturable (cultivable) waste
cap la var cult_waste_01	"Area of culturable waste (hectares)"
cap la var area_na_cu_01	"Area not available for cultivation (hectares)"
}

* Save as 2001 VD dataset
* Drops the columns not kept in the final data, removes zero-population rows
* and the excluded states, then writes the full dataset and a names-only
* extract.
{
drop t_hh_01 t_m_01-st_f_01 jhum_cul-dupes
* The population column picked up the _01 suffix with the other data columns; restore its name
rename t_p_vd_01 t_p_vd
drop if t_p_vd==0
la var t_p_vd "2001 VD population"
drop if inlist(st_code,4,7,25,26,30,31,34,35)  // drop non-RGGVY states
duplicates drop
* vd01_id is added as the last sort key: sort orders tied rows randomly, so
* without a unique tiebreak the row order of the saved files would change
* from run to run whenever two rows share state, district and village code.
sort st_code dt_code vi_code vd01_id
compress
save "$vd01/vd_2001.dta", replace

* Names-only extract: identifiers, village name, population and row id
keep st_code dt_code bk_code* vi_code village_vd t_p_vd vd01_id
save "$vd01/vd_2001_names.dta", replace

}

********************************************************************************
********************************************************************************